use curatedMetagenomicData.tsv to explore data
library(tidyverse)
library(readxl)
df <- read_tsv("raw-data/curatedMetagenomicData.tsv")
Rows: 20283 Columns: 130
── Column specification ─────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (57): study_name, sample_id, subject_id, body_site, antibiotics_current_use, study_co...
dbl (73): age, infant_age, PMID, number_reads, number_bases, minimum_read_length, median_...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df %>%
group_by(study_name,study_condition) %>%
summarise(count = n())
`summarise()` has grouped output by 'study_name'. You can override using the `.groups` argument.
find studies that included an internal control
df1 <- df %>%
group_by(study_name,study_condition) %>%
summarise(count = n()) %>%
select(study_name) %>% as.list()
vec <- df1$study_name
studyNamesWithCtr <- vec[ave(seq_along(vec), vec, FUN = length)>1] %>% unique()
studyNamesWithCtr
[1] "BrooksB_2017" "Castro-NallarE_2015" "ChngKR_2016"
[4] "DavidLA_2015" "FengQ_2015" "GhensiP_2019"
[7] "GuptaA_2019" "HallAB_2017" "HanniganGD_2017"
[10] "Heitz-BuschartA_2016" "HMP_2019_ibdmdb" "HMP_2019_t2d"
[13] "IjazUZ_2017" "JieZ_2017" "KarlssonFH_2013"
[16] "KieserS_2018" "KosticAD_2015" "LiJ_2014"
[19] "LiJ_2017" "LiSS_2016" "LoombaR_2017"
[22] "NagySzakalD_2017" "NielsenHB_2014" "QinJ_2012"
[25] "QinN_2014" "RaymondF_2016" "RosaBA_2018"
[28] "RubelMA_2020" "SankaranarayananK_2015" "ShiB_2015"
[31] "TettAJ_2016" "ThomasAM_2018a" "ThomasAM_2018b"
[34] "ThomasAM_2019_c" "VatanenT_2016" "VincentC_2016"
[37] "VogtmannE_2016" "WirbelJ_2018" "XieH_2016"
[40] "YachidaS_2019" "YeZ_2018" "YuJ_2015"
[43] "ZellerG_2014" "ZhuF_2020"
now that I have the list of studies with controls, I’ll check the first to make sure my search is accurate
df %>%
group_by(study_name,study_condition) %>%
summarise(count = n()) %>%
filter(study_name == studyNamesWithCtr[1])
the list seems to have worked! Now find CRC studies that include internal Ctrs
studiesCRCwithCtr <- df %>%
group_by(study_name,study_condition) %>%
summarise(count = n()) %>%
filter(study_name %in% studyNamesWithCtr[1:length(studyNamesWithCtr)]) %>%
filter(study_condition == "CRC")
studiesCRCwithCtr
Save table
now that I have a table with only CRC studies with included Ctrs, I’ll look at their Metadata
studiesCRCwithCtr$study_name[1]
[1] "FengQ_2015"
df %>%
filter(study_name == studiesCRCwithCtr$study_name[1])
This study is the first study from list with CRC and internal Ctrs. Although there is no annotation in the curatedMetagenomic table, the original article seem to have some I see if I can pull the table and merge. Below is a bunch of code that clean up Clinical table downloaed from original manuscript
# I confirmed there is lots of data in the published dataset
# curate the table
mdFengQ_2015_SupplData1_1 <- read_excel("raw-data/FengQ_2015_PMID25758642/41467_2015_BFncomms7528_MOESM1193_ESM.xlsx",
skip = 2)
newHeader <- str_replace(string = names(mdFengQ_2015_SupplData1_1),pattern = "\\.\\.\\..+",replacement = "")
for (i in 1:length(newHeader)){
hold <- newHeader[i]
if (newHeader[i+1] == ""){
newHeader[i+1] <- hold
}
# break check
if (i==51) break
}
names(mdFengQ_2015_SupplData1_1) <- paste0(newHeader,"_",mdFengQ_2015_SupplData1_1[1,])
mdFengQ_2015_SupplData1_2 <- mdFengQ_2015_SupplData1_1[-1,]
names(mdFengQ_2015_SupplData1_2)[48] <- "MLG classifier_Type_Probability of carcinoma"
names(mdFengQ_2015_SupplData1_2)[50] <- "MLG classifier_Type_Probability of advanced adenoma"
names(mdFengQ_2015_SupplData1_2)[52] <- "MLG classifier_Type_Probability of advanced adenoma add age"
names(mdFengQ_2015_SupplData1_2)[1] <- "Sample ID"
names(mdFengQ_2015_SupplData1_2) <- str_replace_all(string = names(mdFengQ_2015_SupplData1_2),pattern = "[[:space:]]|\\(|\\/|\\)|\\:",replacement = "_")
mdFengQ_2015_SupplData1_3 <- mdFengQ_2015_SupplData1_2 %>%
mutate(across(names(mdFengQ_2015_SupplData1_2[,3:25]), as.integer)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_2[,26:31]), as.factor)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_2[,32:38]), as.integer)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_2[,39:46]), as.factor)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_2[,47]), as.numeric)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_2[,48]), as.factor)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_2[,49]), as.numeric)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_2[,40]), as.factor))
mdFengQ_2015_SupplData1_3 <- mdFengQ_2015_SupplData1_3 %>%
mutate(Clinical_data_State = factor(Clinical_data_State, levels=c("controls", "advanced adenoma","carcinoma")))
#edit tables as some muberic are actually factors
mdFengQ_2015_SupplData1_4 <- mdFengQ_2015_SupplData1_3 %>%
mutate(across(names(mdFengQ_2015_SupplData1_3[,7:9]), as.factor)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_3[,16:17]), as.factor)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_3[,24:37]), as.factor)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_3[,39:46]), as.factor)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_3[,48]), as.factor)) %>%
mutate(across(names(mdFengQ_2015_SupplData1_3[,50]), as.factor))
# I have now a well formatted table
mdFengQ_2015_SupplData1_4
I can now vidualize some data for exploration purpose
# Age across Status
mdFengQ_2015_SupplData1_3 %>%
mutate(Clinical_data_State = factor(Clinical_data_State, levels=c("controls", "advanced adenoma","carcinoma"))) %>%
ggplot(aes(x = Clinical_data_State,y = `Clinical_data_Age_(yrs)`)) +
geom_violin(trim = T,na.rm = T) +
geom_jitter(width = 0.2,size = 1) +
theme_classic()
# BMI across Status
mdFengQ_2015_SupplData1_3 %>%
mutate(Clinical_data_State = factor(Clinical_data_State, levels=c("controls", "advanced adenoma","carcinoma"))) %>%
ggplot(aes(x = Clinical_data_State,y = Clinical_data_BMI)) +
geom_violin(trim = T,na.rm = T) +
geom_jitter(width = 0.2,size = 1) +
theme_classic()
# Sex across Status
mdFengQ_2015_SupplData1_3 %>%
mutate(Clinical_data_State = factor(Clinical_data_State, levels=c("controls", "advanced adenoma","carcinoma"))) %>%
group_by(Clinical_data_State,`Clinical_data_Gender_(1:male,_2:female)`) %>%
summarise(count = n())
# GGT across Status
mdFengQ_2015_SupplData1_3 %>%
mutate(Clinical_data_State = factor(Clinical_data_State, levels=c("controls", "advanced adenoma","carcinoma"))) %>%
ggplot(aes(x = Clinical_data_State,y = `Clinical_data_GGT_(U/L)`)) +
geom_violin(trim = T,na.rm = T) +
geom_jitter(width = 0.2,size = 1) +
theme_classic()
Part of the future work will include the identificationof correlative values between certain Taxa or BGC and metadata. To speed up representation, I divide the variables (columns) between continous and factorial, and adopt differnet plotting strategy to identify linear regression
# Loop to print multiple exploratory Plots
## classify variables are numeric or factorial
numericVar <- mdFengQ_2015_SupplData1_4 %>%
select_if(.predicate = is.numeric) %>%
names()
factorialVar <- mdFengQ_2015_SupplData1_4 %>%
select_if(.predicate = is.factor) %>%
names()
#give name to each value in the vectors
numericVar = set_names(numericVar)
factorialVar = set_names(factorialVar)
# make plotting functions
## for continuous data, use scatter
scatter_fun = function(data,x,y){
ggplot(data, aes(x = .data[[x]], y = .data[[y]])) +
geom_point(na.rm=TRUE) +
geom_smooth(method='lm',na.rm=TRUE) +
theme_bw() +
theme(axis.text = element_blank(),axis.title = element_text(size = 5))
}
## for factorial data, use violin
violin_fun = function(data,x, y) {
ggplot(data, aes(x = .data[[x]], y = .data[[y]]) ) +
geom_violin() +
geom_jitter(size = 1, alpha = 0.5, width = 0.2) +
theme_bw()
}
numericVar
Sample_ID
"Sample_ID"
Clinical_data_Age__yrs_
"Clinical_data_Age__yrs_"
Clinical_data_BMI
"Clinical_data_BMI"
Clinical_data_Waist__cm_
"Clinical_data_Waist__cm_"
Clinical_data_Hip__cm_
"Clinical_data_Hip__cm_"
Clinical_data_GGT__U_L_
"Clinical_data_GGT__U_L_"
Clinical_data_GOT__AST___U_L_
"Clinical_data_GOT__AST___U_L_"
Clinical_data_GPT__ALT___U_L_
"Clinical_data_GPT__ALT___U_L_"
Clinical_data_Fasting_insulin__µU_mL_
"Clinical_data_Fasting_insulin__µU_mL_"
Clinical_data_Fasting_glucose__mg_L_
"Clinical_data_Fasting_glucose__mg_L_"
Clinical_data_HOMA_index
"Clinical_data_HOMA_index"
Clinical_data_CRP__mg_L_
"Clinical_data_CRP__mg_L_"
Clinical_data_Ferritin__ng_mL_
"Clinical_data_Ferritin__ng_mL_"
Clinical_data_Hb__g_L_
"Clinical_data_Hb__g_L_"
Clinical_data_TG__mg_L_
"Clinical_data_TG__mg_L_"
Clinical_data_HDL__mg_L_
"Clinical_data_HDL__mg_L_"
Clinical_data_LDL__mg_L_
"Clinical_data_LDL__mg_L_"
Fiber_intake_g_week
"Fiber_intake_g_week"
MLG_classifier_Probability_of_carcinoma
"MLG_classifier_Probability_of_carcinoma"
MLG_classifier_Probability_of_advanced_adenoma
"MLG_classifier_Probability_of_advanced_adenoma"
factorialVar
Clinical_data_State
"Clinical_data_State"
Clinical_data_WHR__waist-hip-ratio_
"Clinical_data_WHR__waist-hip-ratio_"
Clinical_data_Gender__1_male,_2_female_
"Clinical_data_Gender__1_male,_2_female_"
Clinical_data_Fatty_liver_in_ultrasound_1_yes,_0_no_
"Clinical_data_Fatty_liver_in_ultrasound_1_yes,_0_no_"
Clinical_data_Diabetes__1_yes,_0__no_
"Clinical_data_Diabetes__1_yes,_0__no_"
Clinical_data_Hba1C__%_
"Clinical_data_Hba1C__%_"
Clinical_data_Hypertension__1_yes,_0_no_
"Clinical_data_Hypertension__1_yes,_0_no_"
Clinical_data_MetS__1_yes,_0_no_
"Clinical_data_MetS__1_yes,_0_no_"
Clinical_data_TNM_-_Classification_of_carcinoma
"Clinical_data_TNM_-_Classification_of_carcinoma"
Clinical_data_Histology
"Clinical_data_Histology"
Clinical_data_Localization_in_colon__right_left_rectum_
"Clinical_data_Localization_in_colon__right_left_rectum_"
Fast_food_Every_day
"Fast_food_Every_day"
Fast_food_Every_week
"Fast_food_Every_week"
Fast_food_Less_than_every_week_or_never
"Fast_food_Less_than_every_week_or_never"
Meat_in_g_week_Red_meat__pork,_beef,_veal,_venison_
"Meat_in_g_week_Red_meat__pork,_beef,_veal,_venison_"
Meat_in_g_week_White_meat__chicken,_turkey_
"Meat_in_g_week_White_meat__chicken,_turkey_"
Meat_in_g_week_Total__all_meat,_i.e._red,_white_and_other_
"Meat_in_g_week_Total__all_meat,_i.e._red,_white_and_other_"
Fish_g_week
"Fish_g_week"
Vegetables_g_week
"Vegetables_g_week"
Fruits_g_week
"Fruits_g_week"
Smoking_0_never,_1_ever
"Smoking_0_never,_1_ever"
Smoking_Current
"Smoking_Current"
Smoking_1_more_than_or_equal_to_1_pack_day
"Smoking_1_more_than_or_equal_to_1_pack_day"
Smoking_1_more_than_or_equal_to_2_pack_day
"Smoking_1_more_than_or_equal_to_2_pack_day"
Physical_activity_1_low
"Physical_activity_1_low"
Physical_activity_1_intermediate
"Physical_activity_1_intermediate"
Physical_activity_1_high
"Physical_activity_1_high"
Physical_activity_1_physical_activity_available,_0_physical_activity_not_available
"Physical_activity_1_physical_activity_available,_0_physical_activity_not_available"
MLG_classifier_Type_Probability_of_carcinoma
"MLG_classifier_Type_Probability_of_carcinoma"
MLG_classifier_Type_Probability_of_advanced_adenoma
"MLG_classifier_Type_Probability_of_advanced_adenoma"
After having created vectros with either continous or factorial data (just name holders), and produced the desired scatter plot funciton, I can test the function outside a loop, unsing two arbitrary continous variables
# Try functions outside a loop
scatter_fun(data = mdFengQ_2015_SupplData1_4,
x = "Clinical_data_GPT__ALT___U_L_", y = "Clinical_data_CRP__mg_L_")
Then loop through all continous variables and cross-analyze all of them, and Show a single plot from the collection
# looping through only Scatter Plot data (continuous)
all_plots = map(numericVar, function(resp) {
map(numericVar, function(i) {
scatter_fun(data = mdFengQ_2015_SupplData1_4, x = i, y = resp)
})
})
# print single plot
all_plots$Clinical_data_Age__yrs_$Clinical_data_GGT__U_L_
ProduceGroup plots based on the same variables as a y-axis
colleciton_plots = map(all_plots, ~cowplot::plot_grid(plotlist = .x))
colleciton_plots$Clinical_data_Age__yrs_
colleciton_plots
$Sample_ID
$Clinical_data_Age__yrs_
$Clinical_data_BMI
$Clinical_data_Waist__cm_
$Clinical_data_Hip__cm_
$Clinical_data_GGT__U_L_
$Clinical_data_GOT__AST___U_L_
$Clinical_data_GPT__ALT___U_L_
$Clinical_data_Fasting_insulin__µU_mL_
$Clinical_data_Fasting_glucose__mg_L_
$Clinical_data_HOMA_index
$Clinical_data_CRP__mg_L_
$Clinical_data_Ferritin__ng_mL_
$Clinical_data_Hb__g_L_
$Clinical_data_TG__mg_L_
$Clinical_data_HDL__mg_L_
$Clinical_data_LDL__mg_L_
$Fiber_intake_g_week
$MLG_classifier_Probability_of_carcinoma
$MLG_classifier_Probability_of_advanced_adenoma
I can now proceed to produce visuals for factorial data (x-asis) vs continuos data (y-axis) using violin plots
## for facterial data, use violin
violin_fun = function(data,x, y) {
ggplot(data, aes(x = .data[[x]], y = .data[[y]]) ) +
geom_violin() +
geom_jitter(size = 1, alpha = 0.5, width = 0.2) +
theme_bw() +
theme(axis.text = element_blank(),axis.title = element_text(size = 5))
}
# Try functions outside a loop
violin_fun(data = mdFengQ_2015_SupplData1_4,
x = "Clinical_data_State", y = "Clinical_data_BMI")
Loop through
# looping through only Violin Plot data (cont vs Factorial)
all_plots2 = map(factorialVar, function(resp) {
map(numericVar, function(i) {
violin_fun(data = mdFengQ_2015_SupplData1_4, y = i, x = resp)
})
})
colleciton_plots2 = map(all_plots2, ~cowplot::plot_grid(plotlist = .x))
And print
colleciton_plots2
$Clinical_data_State
$`Clinical_data_WHR__waist-hip-ratio_`
$`Clinical_data_Gender__1_male,_2_female_`
$`Clinical_data_Fatty_liver_in_ultrasound_1_yes,_0_no_`
$`Clinical_data_Diabetes__1_yes,_0__no_`
$`Clinical_data_Hba1C__%_`
$`Clinical_data_Hypertension__1_yes,_0_no_`
$`Clinical_data_MetS__1_yes,_0_no_`
$`Clinical_data_TNM_-_Classification_of_carcinoma`
$Clinical_data_Histology
$Clinical_data_Localization_in_colon__right_left_rectum_
$Fast_food_Every_day
$Fast_food_Every_week
$Fast_food_Less_than_every_week_or_never
$`Meat_in_g_week_Red_meat__pork,_beef,_veal,_venison_`
$`Meat_in_g_week_White_meat__chicken,_turkey_`
$`Meat_in_g_week_Total__all_meat,_i.e._red,_white_and_other_`
$Fish_g_week
$Vegetables_g_week
$Fruits_g_week
$`Smoking_0_never,_1_ever`
$Smoking_Current
$Smoking_1_more_than_or_equal_to_1_pack_day
$Smoking_1_more_than_or_equal_to_2_pack_day
$Physical_activity_1_low
$Physical_activity_1_intermediate
$Physical_activity_1_high
$`Physical_activity_1_physical_activity_available,_0_physical_activity_not_available`
$MLG_classifier_Type_Probability_of_carcinoma
$MLG_classifier_Type_Probability_of_advanced_adenoma